"""
Residualized-KAOPEN test across the portfolio.

For each paper's KAOPEN interaction, runs:
1. Original: Z₁×KAOPEN → DV (replicate)
2. Residualized: KAOPEN purged of income+OECD, then Z₁×KAOPEN_resid → DV
3. Horse race: Z₁×income + Z₁×KAOPEN simultaneously → DV

Uses 140-country panel with year <= 2024.
"""

import sys
sys.path.insert(0, '/mnt/c/demographics_capital_flows/multilateral/140_country/src')

import pandas as pd
import numpy as np
from model import PanelGLS
from pathlib import Path

OUTPUT_DIR = Path('/mnt/c/demographics_capital_flows/fragility/output/tables')

# ── Load data ──────────────────────────────────────────────────────────────
df = pd.read_csv('/mnt/c/demographics_capital_flows/multilateral/140_country/data/processed/full_panel.csv')
df = df[df['year'] <= 2024].copy()

# OECD membership indicator
OECD = {'AUS','AUT','BEL','CAN','CHL','COL','CRI','CZE','DNK','EST','FIN','FRA',
        'DEU','GRC','HUN','ISL','IRL','ISR','ITA','JPN','KOR','LVA','LTU','LUX',
        'MEX','NLD','NZL','NOR','POL','PRT','SVK','SVN','ESP','SWE','CHE','TUR',
        'GBR','USA'}
df['oecd'] = df['iso3'].isin(OECD).astype(float)
df['log_gdp_pc'] = np.log(df['gdp_pc_ppp'].clip(lower=100))

# ── Residualize KAOPEN ─────────────────────────────────────────────────────
mask_resid = df[['kaopen','log_gdp_pc','oecd']].notna().all(axis=1)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(df.loc[mask_resid, ['log_gdp_pc','oecd']], df.loc[mask_resid, 'kaopen'])
df.loc[mask_resid, 'kaopen_resid'] = df.loc[mask_resid, 'kaopen'] - lr.predict(df.loc[mask_resid, ['log_gdp_pc','oecd']])

# Create interactions
df['Z1_x_kaopen'] = df['Z_1'] * df['kaopen']
df['Z1_x_kaopen_resid'] = df['Z_1'] * df['kaopen_resid']
df['Z1_x_log_gdp_pc'] = df['Z_1'] * df['log_gdp_pc']

# ── EBA-style controls ─────────────────────────────────────────────────────
eba_controls = ['nfa_gdp_lag', 'nfa_gdp_lag_sq', 'nfa_positive', 'nfa_negative',
                'output_gap', 'rgdp_growth', 'fiscal_bal_gdp', 'old_dep',
                'rel_output_per_worker', 'log_rel_opw']

def run_test(dv, label, extra_controls=None):
    """Run original, residualized, and horse race for a given DV."""
    controls = [c for c in eba_controls if c in df.columns]
    if extra_controls:
        controls = controls + [c for c in extra_controls if c in df.columns]

    results = {}

    # Spec 1: Original Z₁×KAOPEN
    xcols_orig = ['Z_1','Z_2','Z_3','kaopen','Z1_x_kaopen'] + controls
    sub = df[['iso3','year',dv] + xcols_orig].dropna()
    if len(sub) < 100:
        print(f"  SKIP {label}: only {len(sub)} obs")
        return None

    gls = PanelGLS()
    gls.fit(sub[dv].values, sub[xcols_orig].values, sub['iso3'].values, sub['year'].values)
    idx_kaopen = xcols_orig.index('Z1_x_kaopen')
    results['orig_coef'] = gls.beta[idx_kaopen]
    results['orig_se'] = gls.se[idx_kaopen]
    results['orig_p'] = gls.pvalues[idx_kaopen]
    results['orig_n'] = gls.n_obs

    # Spec 2: Residualized KAOPEN
    xcols_resid = ['Z_1','Z_2','Z_3','kaopen_resid','Z1_x_kaopen_resid'] + controls
    sub2 = df[['iso3','year',dv] + xcols_resid].dropna()

    gls2 = PanelGLS()
    gls2.fit(sub2[dv].values, sub2[xcols_resid].values, sub2['iso3'].values, sub2['year'].values)
    idx_resid = xcols_resid.index('Z1_x_kaopen_resid')
    results['resid_coef'] = gls2.beta[idx_resid]
    results['resid_se'] = gls2.se[idx_resid]
    results['resid_p'] = gls2.pvalues[idx_resid]
    results['resid_n'] = gls2.n_obs

    # Spec 3: Horse race (Z₁×income + Z₁×KAOPEN)
    xcols_horse = ['Z_1','Z_2','Z_3','kaopen','log_gdp_pc',
                   'Z1_x_kaopen','Z1_x_log_gdp_pc'] + controls
    sub3 = df[['iso3','year',dv] + xcols_horse].dropna()

    gls3 = PanelGLS()
    gls3.fit(sub3[dv].values, sub3[xcols_horse].values, sub3['iso3'].values, sub3['year'].values)
    idx_k3 = xcols_horse.index('Z1_x_kaopen')
    idx_i3 = xcols_horse.index('Z1_x_log_gdp_pc')
    results['horse_kaopen_coef'] = gls3.beta[idx_k3]
    results['horse_kaopen_p'] = gls3.pvalues[idx_k3]
    results['horse_income_coef'] = gls3.beta[idx_i3]
    results['horse_income_p'] = gls3.pvalues[idx_i3]
    results['horse_n'] = gls3.n_obs

    return results


# ── Run tests across papers ────────────────────────────────────────────────
tests = [
    ('ca_gdp', 'Paper 1: Multilateral (Z₁×KAOPEN → CA/GDP)', None),
    ('gross_national_savings_gdp', 'Paper 1: Multilateral (Z₁×KAOPEN → Savings/GDP)', None),
    ('gross_investment_gdp', 'Paper 1: Multilateral (Z₁×KAOPEN → Investment/GDP)', None),
    ('gross_fixed_investment_gdp', 'Paper 4/11: Capital Deepening (Z₁×KAOPEN → Fixed I/GDP)', None),
    ('ca_gdp', 'Paper 13: Net/Gross (Z₁×KAOPEN → CA, confirming sign-flip)', None),
]

print("=" * 80)
print("RESIDUALIZED-KAOPEN TEST ACROSS PORTFOLIO")
print("=" * 80)

all_results = []
for dv, label, extra in tests:
    print(f"\n{label}")
    print("-" * 60)
    r = run_test(dv, label, extra)
    if r:
        r['test'] = label
        r['dv'] = dv
        all_results.append(r)

        def stars(p):
            if p < 0.01: return '***'
            elif p < 0.05: return '**'
            elif p < 0.10: return '*'
            return ''

        print(f"  Original Z₁×KAOPEN:      β={r['orig_coef']:8.2f} (p={r['orig_p']:.4f}{stars(r['orig_p'])}), n={r['orig_n']}")
        print(f"  Residualized Z₁×KAOPEN:   β={r['resid_coef']:8.2f} (p={r['resid_p']:.4f}{stars(r['resid_p'])}), n={r['resid_n']}")
        print(f"  Horse race KAOPEN:         β={r['horse_kaopen_coef']:8.2f} (p={r['horse_kaopen_p']:.4f}{stars(r['horse_kaopen_p'])})")
        print(f"  Horse race Income:         β={r['horse_income_coef']:8.2f} (p={r['horse_income_p']:.4f}{stars(r['horse_income_p'])}), n={r['horse_n']}")

# ── Additional test: KAOPEN level prediction (Paper 9B) ───────────────────
print(f"\n{'=' * 80}")
print("KAOPEN LEVEL PREDICTION (Paper 9B replication)")
print("-" * 60)

sub_kl = df[['iso3','year','kaopen','Z_1','Z_2','Z_3','log_gdp_pc','oecd'] +
            [c for c in eba_controls if c in df.columns]].dropna()
xcols_kl = ['Z_1','Z_2','Z_3'] + [c for c in eba_controls if c in df.columns]

gls_kl = PanelGLS()
gls_kl.fit(sub_kl['kaopen'].values, sub_kl[xcols_kl].values, sub_kl['iso3'].values, sub_kl['year'].values)
print(f"  Z₁ → KAOPEN (no income):  β={gls_kl.beta[0]:8.4f} (p={gls_kl.pvalues[0]:.4f}), n={gls_kl.n_obs}")

xcols_kl2 = ['Z_1','Z_2','Z_3','log_gdp_pc','oecd'] + [c for c in eba_controls if c in df.columns]
sub_kl2 = df[['iso3','year','kaopen'] + xcols_kl2].dropna()
gls_kl2 = PanelGLS()
gls_kl2.fit(sub_kl2['kaopen'].values, sub_kl2[xcols_kl2].values, sub_kl2['iso3'].values, sub_kl2['year'].values)
print(f"  Z₁ → KAOPEN (with income): β={gls_kl2.beta[0]:8.4f} (p={gls_kl2.pvalues[0]:.4f}), n={gls_kl2.n_obs}")

# ── Correlation check ──────────────────────────────────────────────────────
print(f"\n{'=' * 80}")
print("CORRELATION DIAGNOSTICS")
print("-" * 60)
corr_sub = df[['kaopen','log_gdp_pc','oecd']].dropna()
print(f"  KAOPEN vs log(GDP/pc):   r = {corr_sub['kaopen'].corr(corr_sub['log_gdp_pc']):.3f}")
print(f"  KAOPEN vs OECD:          r = {corr_sub['kaopen'].corr(corr_sub['oecd']):.3f}")
ceiling = (df['kaopen'] >= df['kaopen'].max() - 0.01).sum() / df['kaopen'].notna().sum()
oecd_ceiling = (df.loc[df['oecd']==1, 'kaopen'] >= df['kaopen'].max() - 0.01).sum() / df.loc[df['oecd']==1, 'kaopen'].notna().sum()
print(f"  KAOPEN ceiling bunching (all):  {ceiling:.1%}")
print(f"  KAOPEN ceiling bunching (OECD): {oecd_ceiling:.1%}")

# ── Write output table ─────────────────────────────────────────────────────
lines = ['# Table 6: Residualized-KAOPEN Test Across Portfolio\n']
lines.append('Tests whether Z₁×KAOPEN interactions survive after purging KAOPEN of income and OECD effects.\n')
lines.append('| Test | DV | Orig β | Orig p | Resid β | Resid p | Horse KAOPEN p | Horse Income p | N |')
lines.append('|:---|:---|---:|---:|---:|---:|---:|---:|---:|')

for r in all_results:
    short = r['test'].split('(')[0].strip()
    lines.append(f"| {short} | {r['dv']} | {r['orig_coef']:.2f} | {r['orig_p']:.4f} | {r['resid_coef']:.2f} | {r['resid_p']:.4f} | {r['horse_kaopen_p']:.4f} | {r['horse_income_p']:.4f} | {r['orig_n']} |")

lines.append('')
lines.append('## Interpretation')
lines.append('')
lines.append('If the residualized p-value is >0.10 and the horse race kills KAOPEN while income survives,')
lines.append('the original Z₁×KAOPEN interaction was spurious — driven by KAOPEN\'s correlation with income.')

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / 'table6_residualized_kaopen.md').write_text('\n'.join(lines))
print(f"\nTable written to {OUTPUT_DIR / 'table6_residualized_kaopen.md'}")
